Investigate intra distances for Doliolida

Author

Thelma Panaïotis

Prepare data

Load distances

load("data/02a.f_val_dist_small.Rdata")
load("data/02b.rq_coef_small.Rdata")

## Intra distances
# list processed files
processed <- list.files("data", pattern = "04\\.", full.names = TRUE)

# load data
res <- sapply(processed, function(x) mget(load(x)), simplify = TRUE)

# get summary data (1st line)
df_intra <- res[1,] %>% bind_rows()
# get distances (2nd line)
df_intra_dist <- res[2,] %>% bind_rows()

## Drop colonial Collodaria because of segmentation artifact
df_intra <- df_intra %>% filter(!str_detect(taxon, "Collodaria_colonial"))
df_intra_dist <- df_intra_dist %>% filter(!str_detect(taxon, "Collodaria_colonial"))

## Assign colour and shape to each taxon
df_intra <- df_intra %>% 
  mutate(
    colour = as.character(paletteer_d(`"khroma::discreterainbow"`, n = 27)),
    shape = rep(21:25, 6)[1:27]
  )

Process null distances

# Apply log transformation for plotting
f_val_dist <- f_val_dist %>% 
  mutate(
    log_n_dist = log10(n_dist),
    log_test_stat = log10(test_stat)
  )

## Generate data to plot a ribbon between the regression lines
# limits for n_dist
lim_dist <- c(50, 2e9)
# Generate ribbon
rib_data <- tibble(n_dist = lim_dist) %>% 
  mutate(
    # apply log-transformation
    log_n_dist = log10(n_dist),
    # compute estimated kuiper-stat from slope and intercept
    ymin = rq_coef %>% filter(tau == 0.05) %>% pull(slope) * log_n_dist + rq_coef %>% filter(tau == 0.05) %>% pull(intercept),
    ymax = rq_coef %>% filter(tau == 0.95) %>% pull(slope) * log_n_dist + rq_coef %>% filter(tau == 0.95) %>% pull(intercept)
  ) %>% 
  # reformat and reorder to plot a polygon
  pivot_longer(ymin:ymax, values_to = "y") %>% 
  mutate(order = c(1, 2, 4, 3)) %>% 
  arrange(order)

Process plankton distances

df_intra_dist_small <- df_intra_dist %>% 
  pivot_longer(dist:dist_rand) %>% 
  mutate(value = value * 51 / 10000) %>% 
  left_join(plankton_esd, by = join_by(taxon)) #%>% 

# Perform Kuiper test using only small distances
df_intra_small <- mclapply(df_intra$taxon, function (ta) {
  # Get distances
  d1 <- df_intra_dist_small %>% filter(taxon == ta) %>% filter(name == "dist") %>% pull(value) # plankton
  d2 <- df_intra_dist_small %>% filter(taxon == ta) %>% filter(name == "dist_rand") %>% pull(value) # null
  
  # Number of distances
  n_dist_small <- (length(d1) + length(d2)) / 2
  
  # Kuiper test
  kt <- kuiper_test(d1, d2)
  
  # Return results
  return(
    tibble(
      taxon = ta,
      prop_dist_small = n_dist_small / 10000, # proportion of small distances based on the quantiles
      test_stat = kt[1],
      p_value = kt[2]
    ))
  
}, mc.cores = n_cores) %>% 
  bind_rows()

# compute number of small distances based on proportion of small distances and the total number of distances
df_intra_small <- df_intra %>% 
  select(taxon, n_dist, colour, shape) %>% 
  left_join(df_intra_small, by = join_by(taxon)) %>% 
  mutate(
    n_dist_small = round(n_dist * prop_dist_small), # total number of small distances
    log_n_dist_small = log10(n_dist_small), # log transform number of small distances
    log_test_stat = log10(test_stat) # log transform test stat
  ) %>% 
  filter(n_dist_small > 100) # keep only taxons with at least 100 small distances

# Detect taxa which are above the ribbon and have the minimum number of distances required
df_intra_small <- df_intra_small %>% 
  mutate(
    above = log_test_stat > log_n_dist_small * (rq_coef %>% filter(tau == 0.95) %>% pull(slope)) + (rq_coef %>% filter(tau == 0.95) %>% pull(intercept)), # above the polygon
    keep = n_dist_small > n_dist_min, # enough distances
    above = above & keep # combination of both
  ) %>% 
  select(-keep)
df_intra_small_above <- df_intra_small %>% filter(above) # needed to keep the same colour and shape across plots

Results

Distribution of distances

df_intra_dist_small <- df_intra_dist_small %>% 
  mutate(name = ifelse(name == "dist", "plankton", "null"))

p1a <- df_intra_dist_small %>% 
  filter(taxon == "Doliolida") %>% 
  ggplot() +
  geom_density(aes(x = value, colour = name)) +
  geom_vline(xintercept = c(5, 10, 20), alpha = 0.5, linewidth = 0.5) +
  labs(x = "Distance (cm)", y = "Density", colour = "Type", title = "All distances") +
  theme_classic()

p2a <- df_intra_dist_small %>% 
  filter(taxon == "Doliolida") %>% 
  filter(value < 20) %>% 
  ggplot() +
  geom_density(aes(x = value, colour = name)) +
  labs(x = "Distance (cm)", y = "Density", colour = "Type", title = "Below 20") +
  theme_classic()

p3a <- df_intra_dist_small %>% 
  filter(taxon == "Doliolida") %>% 
  filter(value < 10) %>% 
  ggplot() +
  geom_density(aes(x = value, colour = name)) +
  labs(x = "Distance (cm)", y = "Density", colour = "Type", title = "Below 10") +
  theme_classic()

p4a <- df_intra_dist_small %>% 
  filter(taxon == "Doliolida") %>% 
  filter(value < 5) %>% 
  ggplot() +
  geom_density(aes(x = value, colour = name)) +
  labs(x = "Distance (cm)", y = "Density", colour = "Type", title = "Below 5") +
  theme_classic()

p1a + p2a + p3a + p4a

Important

There is this weird drop at the right of the distribution when filtering distances.

What happens if we clip the x axis instead of filtering distances? Let’s try it for 20 cm.

df_intra_dist_small %>% 
  filter(taxon == "Doliolida") %>% 
  ggplot() +
  geom_density(aes(x = value, colour = name)) +
  geom_vline(xintercept = c(5, 10, 20), alpha = 0.5, linewidth = 0.5) +
  scale_x_continuous(limits = c(0, 20)) +
  labs(x = "Distance (cm)", y = "Density", colour = "Type", title = "Clip 20") +
  theme_classic()

Same as filtering distances.

ECDFs

Let’s plot corresponding ECDFs.

p1b <- df_intra_dist_small %>% 
  filter(taxon == "Doliolida") %>% 
  ggplot() +
  stat_ecdf(aes(x = value, colour = name)) +
  geom_vline(xintercept = c(5, 10, 20), alpha = 0.5, linewidth = 0.5) +
  labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "All distances") +
  theme_classic()

p2b <- df_intra_dist_small %>% 
  filter(taxon == "Doliolida") %>% 
  filter(value < 20) %>% 
  ggplot() +
  stat_ecdf(aes(x = value, colour = name)) +
  labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "Below 20") +
  theme_classic()

p3b <- df_intra_dist_small %>% 
  filter(taxon == "Doliolida") %>% 
  filter(value < 10) %>% 
  ggplot() +
  stat_ecdf(aes(x = value, colour = name)) +
  labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "Below 10") +
  theme_classic()

p4b <- df_intra_dist_small %>% 
  filter(taxon == "Doliolida") %>% 
  filter(value < 5) %>% 
  ggplot() +
  stat_ecdf(aes(x = value, colour = name)) +
  labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "Below 5") +
  theme_classic()

p1b + p2b + p3b + p4b

Let’s plot ECDFs alongside with distributions.

(p2a + geom_vline(xintercept = 7.35, alpha = 0.5, linewidth = 0.5, colour = "grey")) + 
  (p3a + geom_vline(xintercept = 6.15, alpha = 0.5, linewidth = 0.5, colour = "grey")) + 
  p2b + 
  p3b

Important

Here we notice that the point were distributions meet vary in x depending on distance filtering. Is this expected?

Now we want to have a look at the differences between ECDFs to establish whether organisms are closer or further than expected. Thus, we need to compute the ECDFs, and not only plot them.

Compute ECDFs

Note

ECDFs are computed on a x axis vector from 0 to max(dist) with a length of 1000.

Filtering after

Let’s start by computing the ECDF using all distances and then filter to restrict it to smaller distances.

# Reshape df to have one row per taxon
df_wide <- df_intra_dist_small %>% 
  filter(taxon == "Doliolida") %>% 
  pivot_wider(names_from = "name", values_from = "value", values_fn = list)

# Compute ecdf
# Needs a nested call: ecdf(x) returns a function, x being the values of interest
# Call this function with argument x2 to get values along x2
x_axis_seq <- seq(0, 70, length.out = 1000)
plank_ecdf <- ecdf(unlist(df_wide$plankton))(x_axis_seq)
null_ecdf <- ecdf(unlist(df_wide$null))(x_axis_seq)

# Store ECDFs and their difference
ecdfs <- tibble(
  x_axis_seq = x_axis_seq, # we need this as the x axis
  plank_ecdf = plank_ecdf,
  null_ecdf = null_ecdf,
  diff = plank_ecdf - null_ecdf
) %>% 
  mutate(rank = row_number()) # not necessarly needed

# Check that generated ECDFs are the same as the plotted ones
p1c <- ecdfs %>% 
  select(x_axis_seq, plank_ecdf, null_ecdf) %>% 
  pivot_longer(plank_ecdf:null_ecdf) %>% 
  ggplot() +
  geom_path(aes(x = x_axis_seq, y = value, colour = name)) +
  labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "All distances") +
  theme_classic()

p2c <- ecdfs %>% 
  select(x_axis_seq, plank_ecdf, null_ecdf) %>% 
  pivot_longer(plank_ecdf:null_ecdf) %>% 
  filter(x_axis_seq < 20) %>% 
  ggplot() +
  geom_path(aes(x = x_axis_seq, y = value, colour = name)) +
  labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "Below 20") +
  theme_classic()

p3c <- ecdfs %>% 
  select(x_axis_seq, plank_ecdf, null_ecdf) %>% 
  pivot_longer(plank_ecdf:null_ecdf) %>% 
  filter(x_axis_seq < 10) %>% 
  ggplot() +
  geom_path(aes(x = x_axis_seq, y = value, colour = name)) +
  labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "Below 10") +
  theme_classic()

p4c <- ecdfs %>% 
  select(x_axis_seq, plank_ecdf, null_ecdf) %>% 
  pivot_longer(plank_ecdf:null_ecdf) %>% 
  filter(x_axis_seq < 5) %>% 
  ggplot() +
  geom_path(aes(x = x_axis_seq, y = value, colour = name)) +
  labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "Below 5") +
  theme_classic()

p1c + p2c + p3c + p4c

Let’s compare with our previous ECDFs.

(p2b + p3b) / (p2c + p3c)

Our computed ECDFs were artificially cut so they do not go up to 1 and are shifted. Rescaling them would probably fix this issue.

Important

Computing ECDFs from all distances and then filtering them later on distances does not seem appropriate. Let’s try computing ECDFs from filtered distances.

Filtering before

Second version: we first filter distances and them compute ECDFs on filtered distances.

## Generate ECDFs for all distance thresholds
dist_thres <- tibble(
    dist_thr_cm = c(70, 20, 10, 5),
    name = c("All distances", "Below 20", "Below 10", "Below 5")
  ) %>% 
  mutate(name = factor(name, levels = c("All distances", "Below 20", "Below 10", "Below 5")))

# Loop over thresholds
ecdfs <- lapply(1:nrow(dist_thres), function(i) {
  # Get distance threshold and plot name
  r <- dist_thres %>% slice(i)
  
  # Generate the wide dataset and apply threshold
  df_wide_r <- df_intra_dist_small %>% 
    filter(taxon == "Doliolida") %>% 
    filter(value < r$dist_thr_cm) %>% 
    pivot_wider(names_from = "name", values_from = "value", values_fn = list)
  
  # Compute ECDF on a common x axis sequence
  x_axis_seq_r <- seq(0, r$dist_thr_cm, length.out = 1000)
  plank_ecdf_r <- ecdf(unlist(df_wide_r$plankton))(x_axis_seq_r)
  null_ecdf_r <- ecdf(unlist(df_wide_r$null))(x_axis_seq_r)
  
  # Store results
  res <- tibble(
    x_axis_seq = x_axis_seq_r,
    plank_ecdf = plank_ecdf_r,
    null_ecdf = null_ecdf_r
  ) %>% 
    mutate(
      diff = plank_ecdf - null_ecdf,
      dist_thr_cm = r$dist_thr_cm,
      name = r$name
    )
  return(res)
}) %>% 
  bind_rows()

ecdfs %>% 
  select(-diff) %>% 
  pivot_longer(plank_ecdf:null_ecdf, names_to = "type", values_to = "value") %>% 
  mutate(type == ifelse(type == "plank_ecdf", "plankton", "null")) %>% 
  ggplot() +
  geom_path(aes(x = x_axis_seq, y = value, colour = type)) +
  labs(x = "Distance (cm)", y = "ECDF", colour = "Type") +
  facet_wrap(~name, scales = "free") +
  theme_classic() +
  theme(strip.background = element_rect(colour = "white"))

Looks quite similar as the original ECDFs, let’s keep these ones. Now let’s have a look at differences between ECDFs.

ECDFs differences

ggplot(ecdfs) +
  geom_path(aes(x = x_axis_seq, y = diff), colour = "#00BFC4") +
  geom_hline(yintercept = 0, colour = "#F8766D") +
  labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
  facet_wrap(~name, scales = "free") +
  theme_classic() +
  theme(strip.background = element_rect(colour = "white"))

Important

Unexpectedly, the direction of the interaction varies depending on the threshold we used on distances. If we look closer, we notice conserved patterns in plankton ECDFs (blue) across distances, but it does not cross 0 for the same distance, as if the null ECDF (red) was shifted above or below.

Doliolid updated quantiles

Read

load("data/test/04a.doliolida_5_cm.Rdata")
res_05 <- res
res_dist_05 <- res_dist

load("data/test/04a.doliolida_10_cm.Rdata")
res_10 <- res
res_dist_10 <- res_dist

load("data/test/04a.doliolida_20_cm.Rdata")
res_20 <- res
res_dist_20 <- res_dist

# Store results together
res_filt <- res_05 %>% mutate(dist_thr = 5) %>% 
  bind_rows(res_10 %>% mutate(dist_thr = 10)) %>% 
  bind_rows(res_20 %>% mutate(dist_thr = 20))

res_filt_dist <- res_dist_05 %>% mutate(dist_thr = 5) %>% 
  bind_rows(res_dist_10 %>% mutate(dist_thr = 10)) %>% 
  bind_rows(res_dist_20 %>% mutate(dist_thr = 20)) %>% 
  rename(plankton = dist, null = dist_rand) %>% 
  # convert pix to cm
  mutate(
    plankton = plankton * 51 / 10000,
    null = null * 51 / 10000
  )

Distributions

res_filt_dist %>% 
  pivot_longer(plankton:null) %>% 
  ggplot() +
  geom_density(aes(x = value, colour = name)) +
  facet_wrap(~dist_thr, scales = "free", ncol = 2) +
  labs(x = "Distance (cm)", y = "Density", colour = "Type") +
  theme_classic() +
  theme(strip.background = element_rect(colour = "white"))

ECDF

dist_thres <- c(5, 10, 20)
# Loop over thresholds
ecdfs_filt <- lapply(dist_thres, function(t) {
  # Get distance threshold and plot name
  t_dist <- res_filt_dist %>% filter(dist_thr == t)
  
  # Compute ECDF on a common x axis sequence
  x_axis_seq_r <- seq(0, t, length.out = 1000)
  plank_ecdf_r <- ecdf(unlist(t_dist$plankton))(x_axis_seq_r)
  null_ecdf_r <- ecdf(unlist(t_dist$null))(x_axis_seq_r)
  
  # Store results
  res <- tibble(
    x_axis_seq = x_axis_seq_r,
    plank_ecdf = plank_ecdf_r,
    null_ecdf = null_ecdf_r
  ) %>% 
    mutate(
      diff = plank_ecdf - null_ecdf,
      dist_thr_cm = t
    )
  return(res)
}) %>% 
  bind_rows()

ecdfs_filt %>% 
  select(-diff) %>% 
  pivot_longer(plank_ecdf:null_ecdf, names_to = "type", values_to = "value") %>% 
  mutate(type == ifelse(type == "plank_ecdf", "plankton", "null")) %>% 
  ggplot() +
  geom_path(aes(x = x_axis_seq, y = value, colour = type)) +
  labs(x = "Distance (cm)", y = "ECDF", colour = "Type") +
  facet_wrap(~dist_thr_cm, scales = "free", ncol = 2) +
  theme_classic() +
  theme(strip.background = element_rect(colour = "white"))

ECDFs differences

ggplot(ecdfs_filt) +
  geom_path(aes(x = x_axis_seq, y = diff), colour = "#00BFC4") +
  geom_hline(yintercept = 0, colour = "#F8766D") +
  labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
  facet_wrap(~dist_thr_cm, scales = "free", ncol = 2) +
  theme_classic() +
  theme(strip.background = element_rect(colour = "white"))

Compare full quantiles and <20 quantiles

# Store together ECDFs from all quantiles and from <20 quantiles
ecdfs_qt <- ecdfs %>% 
  select(-name) %>% 
  mutate(quantile = "full") %>% 
  bind_rows(ecdfs_filt %>% mutate(quantile = "< 20")) %>% 
  filter(dist_thr_cm < 70) # drop the case with all distances

ggplot(ecdfs_qt) + 
  geom_path(aes(x = x_axis_seq, y = diff, colour = quantile), alpha = 0.5) +
  geom_hline(yintercept = 0, colour = "red") +
  facet_wrap(~dist_thr_cm, scales = "free", ncol = 2) +
    labs(
    x = "Distance (cm)", y = "Plankton ECDF - Null ECDF", colour = "Quantiles",
    title = "ECDFs dist for various distance threshold and quantile threshold"
  )

As expected, full quantiles are noisier. Let’s try some smoothing with a moving average.

ecdfs_qt %>% 
  group_by(dist_thr_cm, quantile) %>% 
  mutate(mmean = slide(diff, k = 20, fun = weighted.mean, na.rm = TRUE, n = 3, w = cweights(20))) %>% 
  ungroup() %>% 
  ggplot() +
  geom_path(aes(x = x_axis_seq, y = diff, colour = quantile), alpha = 0.2) +
  geom_path(aes(x = x_axis_seq, y = mmean, colour = quantile)) +
  geom_hline(yintercept = 0, colour = "red") +
  facet_wrap(~dist_thr_cm, scales = "free", ncol = 2) +
  labs(
    x = "Distance (cm)", y = "Plankton ECDF - Null ECDF", colour = "Quantiles",
    title = "ECDFs dist for various distance threshold and quantile threshold"
  )

All organisms: full quantiles and various distance thresholds

Read distances

## Intra distances
# list processed files
processed <- list.files("data", pattern = "04\\.", full.names = TRUE)

# load data
res <- sapply(processed, function(x) mget(load(x)), simplify = TRUE)

# get summary data (1st line)
df_intra <- res[1,] %>% bind_rows()
# get distances (2nd line)
df_intra_dist <- res[2,] %>% 
  bind_rows() %>% 
  # convert from px to cm
  mutate(
    dist = dist * 51 / 10000,
    dist_rand = dist_rand * 51 / 10000
  )

## Drop colonial Collodaria because of segmentation artifact
df_intra <- df_intra %>% filter(!str_detect(taxon, "Collodaria_colonial"))
df_intra_dist <- df_intra_dist %>% filter(!str_detect(taxon, "Collodaria_colonial"))

## Assign colour and shape to each taxon
df_intra <- df_intra %>% 
  mutate(
    colour = as.character(paletteer_d(`"khroma::discreterainbow"`, n = 27)),
    shape = rep(21:25, 6)[1:27]
  )

# List taxonomic groups
taxa <- unique(df_intra_dist$taxon)

Filter distances and compute ECDFs

# Loop over taxonomic groups
ecdf_dist_all <- lapply(taxa, function(ta) {
  # Get distances for this taxon
  df_ta <- df_intra_dist %>% filter(taxon == ta)
  
  # Loop over distance thresholds
  lapply(dist_thres, function(d) {
    # Keep only distances below threshold
    df_ta_d <- df_ta %>% 
      pivot_longer(dist:dist_rand) %>% 
      filter(value < d)
    
    # Compute ECDF
    x_axis_seq <- seq(0, d, length.out = 1000)
    plank_dist <- df_ta_d %>% filter(name == "dist") %>% pull(value)
    null_dist <- df_ta_d %>% filter(name == "dist_rand") %>% pull(value)
    plank_ecdf <- ecdf(plank_dist)(x_axis_seq)
    null_ecdf <- ecdf(null_dist)(x_axis_seq)
    
    # Store results
    res <- tibble(
      taxon = rep(ta, length.out = length(x_axis_seq)),
      dist_thr = rep(d, length.out = length(x_axis_seq)) %>% as.factor(),
      x_axis_seq = x_axis_seq,
      plank_ecdf = plank_ecdf,
      null_ecdf = null_ecdf
    ) %>% 
      mutate(
        # compute difference
        diff = plank_ecdf - null_ecdf,
        # smooth it
        diff_sm = slide(diff, k = 20, fun = weighted.mean, na.rm = TRUE, n = 3, w = cweights(20)),
        # compute sign of deviation
        dir = sign(diff_sm) %>% as.factor()
      )
  
    return(res)  
    
  }) %>% 
    bind_rows()
}) %>% 
  bind_rows()

# Generate a summary for each distance threshold
# - prop closer in each dir
ecdf_dist_all_summ <- ecdf_dist_all %>% 
  group_by(taxon, dist_thr) %>% 
  summarise(
    uni_d = length(unique(dir)) == 1,
    prop_closer = sum(dir == 1) / n(),
    sum_ecdf = sum(diff_sm),
    .groups = "drop"
  )

Plot differences in ECDFs for all thresholds

Proportion of organisms that are closer

ggplot(ecdf_dist_all_summ) + 
  geom_raster(aes(x = taxon, y = dist_thr, fill = prop_closer)) +
  geom_point(data = ecdf_dist_all_summ %>% filter(uni_d), aes(x = taxon, y = dist_thr)) +
  scale_fill_gradient2(
    low = "#d7191c", high = "#2c7bb6", mid = "#ffffbf",
    midpoint = 0.5, limits = c(0, 1)
    ) +
  coord_flip()

Sum of ECDF differences

ggplot(ecdf_dist_all_summ) + 
  geom_raster(aes(x = taxon, y = dist_thr, fill = sum_ecdf)) +
  geom_point(data = ecdf_dist_all_summ %>% filter(uni_d), aes(x = taxon, y = dist_thr)) +
  scale_fill_gradient2(
    low = "#d7191c", high = "#2c7bb6", mid = "#ffffbf",
    limits = c(-50, 50)
    ) +
  coord_flip()

Distribution of ECDF differences

ecdf_dist_all %>% 
  ggplot() +
  geom_path(aes(x = x_axis_seq, y = diff_sm)) +
  geom_area(aes(x = x_axis_seq, y = ifelse(diff_sm < 0, diff_sm, 0)), fill = "#d7191c") +
  geom_area(aes(x = x_axis_seq, y = ifelse(diff_sm > 0, diff_sm, 0)), fill = "#2c7bb6") +
  geom_hline(yintercept = 0, colour = "grey") +
  labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
  facet_grid(taxon~dist_thr, scales = "free", switch = 'y') +
  theme(
    axis.text.y = element_blank(),
    strip.text.y.left = element_text(angle = 0)
  )

Try identifying patterns in ECDFs differences for the 10 cm threshold

For this we used non smoothed differences.

ecdf_10 <- ecdf_dist_all %>% filter(dist_thr == 10)#%>% filter(taxon %in% taxa[1:5])
ecdf_10 %>% 
  ggplot() +
  geom_path(aes(x = x_axis_seq, y = diff)) +
  #geom_area(aes(x = x_axis_seq, y = ifelse(diff < 0, diff, 0)), fill = "#d7191c") +
  #geom_area(aes(x = x_axis_seq, y = ifelse(diff > 0, diff, 0)), fill = "#2c7bb6") +
  geom_hline(yintercept = 0, colour = "grey") +
  labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
  facet_grid(taxon~dist_thr, scales = "free", switch = 'y') +
  theme(
    axis.text.y = element_blank(),
    strip.text.y.left = element_text(angle = 0)
  )

ecdf_10 %>% 
  group_by(taxon, dist_thr) %>% 
  mutate(
    mmed = slide(diff, k = 50, fun = median, na.rm = TRUE, n = 5),
  ) %>% 
  ungroup() %>% 
  ggplot() +
  geom_path(aes(x = x_axis_seq, y = mmed)) +
  #geom_area(aes(x = x_axis_seq, y = ifelse(mmed < 0, mmed, 0)), fill = "#d7191c") +
  #geom_area(aes(x = x_axis_seq, y = ifelse(mmed > 0, mmed, 0)), fill = "#2c7bb6") +
  geom_hline(yintercept = 0, colour = "grey") +
  labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
  facet_wrap(~taxon, ncol = 3, scales = "free") + #strip.position
  theme(
    axis.text.y = element_blank(),
    strip.text.y.left = element_text(angle = 0)
  )

Try identifying a distance threshold

Doliolids

Coarse

Start with coarse thresholds: 10, 20, 30, 40, 50 and 60.

# Define list of thresholds to try
thresholds_c <- seq(5, 60, by = 5)
# Loop over thresholds, filter distances and perform Kuiper test

dol_thres_c <- lapply(thresholds_c, function(thres) {
  # Filter distances
  thres_dist <- df_intra_dist %>% 
    filter(taxon == "Doliolida") %>% 
    pivot_longer(dist:dist_rand) %>% 
    filter(value < thres)
  
  # Extract distances
  d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
  d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
  
  # Perform Kuiper test
  kt <- kuiper_test(d1, d2)
  
  # Return results
  tibble(
    taxon = "Doliolida",
    dist_thres = thres,
    n_qt_p = length(d1),
    n_qt_r = length(d2),
    kuiper_stat = kt[1]
  )
}) %>% 
  bind_rows()

# Join with total number of distances
dol_thres_c <- dol_thres_c %>% 
  left_join(df_intra %>% select(taxon, n_dist), by = join_by(taxon)) %>% 
  mutate(
    # compute number of distances from total distances and proportion of quantiles
    n_dist_p = round(n_dist * (n_qt_p / 10000)),
    n_dist_r = round(n_dist * (n_qt_r / 10000))
  )

# Plot results
ggplot(dol_thres_c) +
  geom_path(aes(x = dist_thres, y = kuiper_stat)) +
  labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
  theme_classic()

ggplot(dol_thres_c) +
  geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
  labs(x = "Number of distances", y = "Kuiper statistic") +
  theme_classic()

Fine

Let’s focus on distances < 20 cm and try to refine the threshold.

# Define list of thresholds to try
thresholds_f <- seq(4, 20, by = 2)
# Loop over thresholds, filter distances and perform Kuiper test

dol_thres_f <- lapply(thresholds_f, function(thres) {
  # Filter distances
  thres_dist <- df_intra_dist %>% 
    filter(taxon == "Doliolida") %>% 
    pivot_longer(dist:dist_rand) %>% 
    filter(value < thres)
  
  # Extract distances
  d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
  d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
  
  # Perform Kuiper test
  kt <- kuiper_test(d1, d2)
  
  # Return results
  tibble(
    taxon = "Doliolida",
    dist_thres = thres,
    n_qt_p = length(d1),
    n_qt_r = length(d2),
    kuiper_stat = kt[1]
  )
}) %>% 
  bind_rows()

# Join with total number of distances
dol_thres_f <- dol_thres_f %>% 
  left_join(df_intra %>% select(taxon, n_dist), by = join_by(taxon)) %>% 
  mutate(
    # compute number of distances from total distances and proportion of quantiles
    n_dist_p = round(n_dist * (n_qt_p / 10000)),
    n_dist_r = round(n_dist * (n_qt_r / 10000))
  )

# Plot results
ggplot(dol_thres_f) +
  geom_path(aes(x = dist_thres, y = kuiper_stat)) +
  labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
  theme_classic()

ggplot(dol_thres_f) +
  geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
  labs(x = "Number of distances", y = "Kuiper statistic") +
  theme_classic()

There is a threshold (12 cm) for which there is a maximum of information.

Let’s try this for all plankton groups, starting with the coarse thresholding.

All taxa

Coarse

# Define list of thresholds to try
thresholds_c <- seq(5, 60, by = 5)
# Generate a combination of taxa by thresholds to try
comb <- crossing(taxon = taxa, thres = thresholds_c)

# Loop over combination of thresholds and taxa
taxa_thres_c <- mclapply(1:nrow(comb), function(i) {
  # Get row of interest
  r <- comb %>% slice(i)
  
  # Threshold distances
  thres_dist <- df_intra_dist %>% 
    filter(taxon == r$taxon) %>% 
    pivot_longer(dist:dist_rand) %>% 
    filter(value < r$thres)
  
  # Extract distances
  d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
  d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
  
  # Perform Kuiper test
  kt <- kuiper_test(d1, d2)
  
  # Return results
  tibble(
    taxon = r$taxon,
    dist_thres = r$thres,
    n_qt_p = length(d1),
    n_qt_r = length(d2),
    kuiper_stat = kt[1]
  )
  
}, mc.cores = n_cores) %>% 
  bind_rows()

# Join with total number of distances
taxa_thres_c <- taxa_thres_c %>% 
  left_join(df_intra %>% select(taxon, n_dist), by = join_by(taxon)) %>% 
  mutate(
    # compute number of distances from total distances and proportion of quantiles
    n_dist_p = round(n_dist * (n_qt_p / 10000)),
    n_dist_r = round(n_dist * (n_qt_r / 10000))
  )

# Plot results
ggplot(taxa_thres_c) +
  geom_path(aes(x = dist_thres, y = kuiper_stat)) +
  labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
  facet_wrap(~taxon, scales = "free") +
  theme_classic() +
  theme(strip.background = element_rect(colour = "white"))

ggplot(taxa_thres_c) +
  geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
  labs(x = "Number of distances", y = "Kuiper statistic") +
  facet_wrap(~taxon, scales = "free") +
  theme_classic() +
  theme(strip.background = element_rect(colour = "white"))

For many taxonomic groups we can identify a maximum of information around 5-10 cm. Let’s try refining that threshold between 4 and 20 cm.

Fine

# Define list of thresholds to try
thresholds_f <- seq(4, 20, by = 2)
# Generate a combination of taxa by thresholds to try
comb <- crossing(taxon = taxa, thres = thresholds_f)

# Loop over combination of thresholds and taxa
taxa_thres_f <- mclapply(1:nrow(comb), function(i) {
  # Get row of interest
  r <- comb %>% slice(i)
  
  # Threshold distances
  thres_dist <- df_intra_dist %>% 
    filter(taxon == r$taxon) %>% 
    pivot_longer(dist:dist_rand) %>% 
    filter(value < r$thres)
  
  # Extract distances
  d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
  d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
  
  # Perform Kuiper test
  kt <- kuiper_test(d1, d2)
  
  # Return results
  tibble(
    taxon = r$taxon,
    dist_thres = r$thres,
    n_qt_p = length(d1),
    n_qt_r = length(d2),
    kuiper_stat = kt[1]
  )
  
}, mc.cores = n_cores) %>% 
  bind_rows()

# Join with total number of distances
taxa_thres_f <- taxa_thres_f %>% 
  left_join(df_intra %>% select(taxon, n_dist), by = join_by(taxon)) %>% 
  mutate(
    # compute number of distances from total distances and proportion of quantiles
    n_dist_p = round(n_dist * (n_qt_p / 10000)),
    n_dist_r = round(n_dist * (n_qt_r / 10000))
  )

# Plot results
ggplot(taxa_thres_f) +
  geom_path(aes(x = dist_thres, y = kuiper_stat)) +
  labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
  facet_wrap(~taxon, scales = "free") +
  theme_classic() +
  theme(strip.background = element_rect(colour = "white"))

ggplot(taxa_thres_f) +
  geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
  labs(x = "Number of distances", y = "Kuiper statistic") +
  facet_wrap(~taxon, scales = "free") +
  theme_classic() +
  theme(strip.background = element_rect(colour = "white"))

The result is not satisfying for all groups. Let’s try the same approach on overall distances.

All distances

First we need to load this data.

load("data/03.all_distances.Rdata")

# Extract distances
df_all_dist <- df_all %>% 
  select(dist, dist_rand) %>% 
  unnest(c(dist, dist_rand))
df_all <- df_all %>% select(-c(dist, dist_rand))

Coarse

# Define list of thresholds to try
thresholds_c <- seq(5, 60, by = 5)
# Loop over thresholds, filter distances and perform Kuiper test

all_thres_c <- lapply(thresholds_c, function(thres) {
  # Filter distances
  thres_dist <- df_all_dist %>% 
    pivot_longer(dist:dist_rand) %>% 
    mutate(value = value * 51 / 10000) %>% 
    filter(value < thres)
  
  # Extract distances
  d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
  d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
  
  # Perform Kuiper test
  kt <- kuiper_test(d1, d2)
  
  # Return results
  tibble(
    taxon = "all",
    dist_thres = thres,
    n_qt_p = length(d1),
    n_qt_r = length(d2),
    kuiper_stat = kt[1]
  )
}) %>% 
  bind_rows()

# Join with total number of distances
all_thres_c <- all_thres_c %>% 
  left_join(df_all %>% select(taxon, n_dist), by = join_by(taxon)) %>% 
  mutate(
    # compute number of distances from total distances and proportion of quantiles
    n_dist_p = round(n_dist * (n_qt_p / 10000)),
    n_dist_r = round(n_dist * (n_qt_r / 10000))
  )

# Plot results
ggplot(all_thres_c) +
  geom_path(aes(x = dist_thres, y = kuiper_stat)) +
  labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
  theme_classic()

ggplot(all_thres_c) +
  geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
  labs(x = "Number of distances", y = "Kuiper statistic") +
  theme_classic()

Great, we have a maximum of information for a given threshold!

Let’s try refining it!

Fine

# Define list of thresholds to try
thresholds_f <- seq(4, 20, by = 2)
# Loop over thresholds, filter distances and perform Kuiper test

all_thres_f <- lapply(thresholds_f, function(thres) {
  # Filter distances
  thres_dist <- df_all_dist %>% 
    pivot_longer(dist:dist_rand) %>% 
    mutate(value = value * 51 / 10000) %>% 
    filter(value < thres)
  
  # Extract distances
  d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
  d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
  
  # Perform Kuiper test
  kt <- kuiper_test(d1, d2)
  
  # Return results
  tibble(
    taxon = "all",
    dist_thres = thres,
    n_qt_p = length(d1),
    n_qt_r = length(d2),
    kuiper_stat = kt[1]
  )
}) %>% 
  bind_rows()

# Join with total number of distances
all_thres_f <- all_thres_f %>% 
  left_join(df_all %>% select(taxon, n_dist), by = join_by(taxon)) %>% 
  mutate(
    # compute number of distances from total distances and proportion of quantiles
    n_dist_p = round(n_dist * (n_qt_p / 10000)),
    n_dist_r = round(n_dist * (n_qt_r / 10000))
  )

# Extract the max of information
max_info <- all_thres_f %>% arrange(desc(kuiper_stat)) %>% slice_head(n = 1)
max_info
# A tibble: 1 × 8
  taxon dist_thres n_qt_p n_qt_r kuiper_stat    n_dist  n_dist_p  n_dist_r
  <chr>      <dbl>  <int>  <int>       <dbl>     <int>     <dbl>     <dbl>
1 all           12   3038   3001      0.0168 483179289 146789868 145002105
# Plot results
ggplot(all_thres_f) +
  geom_path(aes(x = dist_thres, y = kuiper_stat)) +
  geom_vline(xintercept = max_info$dist_thres, colour = "red") +
  labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
  theme_classic()

ggplot(all_thres_f) +
  geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
  geom_vline(xintercept = max_info$n_dist_p, colour = "red") +
  labs(x = "Number of distances", y = "Kuiper statistic") +
  theme_classic()

Try refining even further.

Very fine

# Define list of thresholds to try
thresholds_vf <- seq(9, 15, by = 0.5)
# Loop over thresholds, filter distances and perform Kuiper test

all_thres_vf <- lapply(thresholds_vf, function(thres) {
  # Filter distances
  thres_dist <- df_all_dist %>% 
    pivot_longer(dist:dist_rand) %>% 
    mutate(value = value * 51 / 10000) %>% 
    filter(value < thres)
  
  # Extract distances
  d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
  d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
  
  # Perform Kuiper test
  kt <- kuiper_test(d1, d2)
  
  # Return results
  tibble(
    taxon = "all",
    dist_thres = thres,
    n_qt_p = length(d1),
    n_qt_r = length(d2),
    kuiper_stat = kt[1]
  )
}) %>% 
  bind_rows()

# Join with total number of distances
all_thres_vf <- all_thres_vf %>% 
  left_join(df_all %>% select(taxon, n_dist), by = join_by(taxon)) %>% 
  mutate(
    # compute number of distances from total distances and proportion of quantiles
    n_dist_p = round(n_dist * (n_qt_p / 10000)),
    n_dist_r = round(n_dist * (n_qt_r / 10000))
  )

# Extract the max of information
max_info_vf <- all_thres_vf %>% arrange(desc(kuiper_stat)) %>% slice_head(n = 1)
max_info_vf
# A tibble: 1 × 8
  taxon dist_thres n_qt_p n_qt_r kuiper_stat    n_dist  n_dist_p  n_dist_r
  <chr>      <dbl>  <int>  <int>       <dbl>     <int>     <dbl>     <dbl>
1 all           11   2770   2729      0.0170 483179289 133840663 131859628
# Plot results
ggplot(all_thres_vf) +
  geom_path(aes(x = dist_thres, y = kuiper_stat)) +
  geom_vline(xintercept = max_info_vf$dist_thres, colour = "red") +
  labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
  theme_classic()

ggplot(all_thres_vf) +
  geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
  geom_vline(xintercept = max_info_vf$n_dist_p, colour = "red") +
  labs(x = "Number of distances", y = "Kuiper statistic") +
  theme_classic()

Conclusion

Let’s focus on distances < 11 cm.

Let’s check this threshold and number of distances for all taxa.

# Get the identified threshold for each taxonomic group
taxa_thres_f_max <- taxa_thres_f %>% 
  group_by(taxon) %>% 
  filter(kuiper_stat == max(kuiper_stat)) %>% 
  ungroup()
  
ggplot(taxa_thres_f_max) +
  geom_point(aes(x = n_dist_p, y = dist_thres)) +
  scale_x_log10() +
  labs(x = "Number of distances", y = "Distance threshold (cm)")

ggplot(taxa_thres_f_max) +
  geom_point(aes(x = n_dist_p, y = kuiper_stat, colour = dist_thres)) +
  scale_colour_viridis_c() +
  scale_x_log10() + scale_y_log10() +
  labs(x = "Number of distances", y = "Kuiper statistic", colour = "Distance threshold (cm)")

A distance threshold can be identified only if enough distances are present. Recomputing distances and saving quantiles < 20 cm could help identify more thresholds.

What about the blue point at 10⁵ distances?

taxa_thres_f_max %>% filter(n_dist_p > 10000 & dist_thres == 4)
# A tibble: 1 × 8
  taxon    dist_thres n_qt_p n_qt_r kuiper_stat  n_dist n_dist_p n_dist_r
  <chr>         <dbl>  <int>  <int>       <dbl>   <int>    <dbl>    <dbl>
1 Rhizaria          4    590    579      0.0167 1520322    89699    88027
# It’s Rhizaria

taxa_thres_f %>% 
  filter(taxon == "Rhizaria") %>% 
  ggplot() +
  geom_path(aes(x = dist_thres, y = kuiper_stat)) +
  #geom_vline(xintercept = max_info$dist_thres, colour = "red") +
  labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
  theme_classic()

Now that we have a distance threshold, apply it to all taxonomic groups and try to identify patterns in ECDF differences.

Apply the 11 cm threshold

Important

We might need to recompute distances to store quantiles below a given threshold, probably a little bit higher than 11 cm.

Process plankton distances

dist_thr <- 11

# Loop over taxonomic groups, perform Kuiper test, compute ECDF
df_intra_11 <- lapply(taxa, function(ta) {
  # Get distances for this taxon
  df_ta <- df_intra_dist %>% 
    filter(taxon == ta) %>% 
    # Keep only distances below threshold
    pivot_longer(dist:dist_rand) %>% 
    filter(value < dist_thr)
  
  plank_dist <- df_ta %>% filter(name == "dist") %>% pull(value)
  null_dist <- df_ta %>% filter(name == "dist_rand") %>% pull(value)
  n_dist <- (length(plank_dist) + length(null_dist))/2
  
  # Kuiper test
  kt <- kuiper_test(plank_dist, null_dist)
  
  # Compute ECDF
  x_axis_seq <- seq(0, dist_thr, length.out = 1000)

  plank_ecdf <- ecdf(plank_dist)(x_axis_seq)
  null_ecdf <- ecdf(null_dist)(x_axis_seq)
  
  # Store results
  res <- tibble(
    taxon = ta,
    dist_thr = dist_thr,
    n_dist_small_qt = n_dist,
    test_stat = kt[1], 
    p_value = kt[2], 
    x_axis_seq = list(x_axis_seq),
    plank_ecdf = list(plank_ecdf),
    null_ecdf = list(null_ecdf)
  )

  # Return result
  return(res)
}) %>% 
  bind_rows()

# Extract ECDFs, compute difference and smooth it
df_intra_ecdf_11 <- df_intra_11 %>% 
  select(taxon, x_axis_seq, plank_ecdf, null_ecdf) %>% 
  unnest(c(x_axis_seq, plank_ecdf, null_ecdf)) %>% 
  group_by(taxon) %>% 
  mutate(
    diff = plank_ecdf - null_ecdf,
    diff_sm = slide(diff, k = 20, fun = weighted.mean, na.rm = TRUE, n = 3, w = cweights(20))
  ) %>% 
  ungroup()

# Get total number of distances below threshold
df_intra_11 <- df_intra_11 %>% 
  select(-c(x_axis_seq, plank_ecdf, null_ecdf)) %>% 
  left_join(df_intra %>% select(taxon, n_dist_tot = n_dist, colour, shape), by = join_by(taxon)) %>% 
  mutate(
    n_dist_small = (n_dist_small_qt / 10000) * n_dist_tot,
    log_n_dist_small = log10(n_dist_small),
    log_test_stat = log10(test_stat)
  ) %>% 
  filter(n_dist_small > 100)

# Detect taxa which are above the ribbon and have the minimum number of distances required
df_intra_11 <- df_intra_11 %>% 
  mutate(
    above = log_test_stat > log_n_dist_small * (rq_coef %>% filter(tau == 0.95) %>% pull(slope)) + (rq_coef %>% filter(tau == 0.95) %>% pull(intercept)), # above the polygon
    keep = n_dist_small > n_dist_min, # enough distances
    above = above & keep # combination of both
    ) %>% 
  select(-keep)
df_intra_11_above <- df_intra_11 %>% filter(above) # needed to keep the same colour and shape across plots

Plot Kuiper stat VS number of distances

# With only taxa above ribbon
ggplot() +
  geom_boxplot(data = f_val_dist, aes(x = log_n_dist, y = log_test_stat, group = log_n_dist), colour = "gray", outlier.shape = NA) +
  geom_polygon(data = rib_data, aes(x = log_n_dist, y = y), alpha = 0.1) +
  geom_point(data = df_intra_11 %>% filter(above), aes(x = log_n_dist_small, y = log_test_stat, fill = taxon, colour = taxon, shape = taxon)) +
  geom_point(data = df_intra_11 %>% filter(!above), aes(x = log_n_dist_small, y = log_test_stat), colour = "gray") +
  scale_fill_manual(values = df_intra_11_above$colour) +
  scale_colour_manual(values = df_intra_11_above$colour) +
  scale_shape_manual(values = df_intra_11_above$shape) +
  scale_x_continuous(labels = label_math(expr = 10^.x, format = force), breaks = seq(2, 8, by = 2)) +
  scale_y_continuous(labels = label_math(expr = 10^.x, format = force)) +
  labs(x = "N distances", y = "Kuiper statistic", colour = "Taxon", fill = "Taxon", shape = "Taxon") 

Plot ECDFs

For all taxonomic groups.

df_intra_ecdf_11 %>% 
  mutate(random = ifelse(taxon %in% df_intra_11_above$taxon, "Non random", "Random")) %>% 
  ggplot() +
  geom_path(aes(x = x_axis_seq, y = diff_sm, colour = random)) +
  geom_hline(yintercept = 0, colour = "grey") +
  labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
  facet_wrap(~taxon, ncol = 3, scales = "free") + #strip.position
  theme(
    axis.text.y = element_blank(),
    strip.text.y.left = element_text(angle = 0)
  )

And only for taxonomic groups which have non random distribution of distances.

df_intra_ecdf_11 %>% 
  filter(taxon %in% df_intra_11_above$taxon) %>% 
  ggplot() +
  geom_path(aes(x = x_axis_seq, y = diff_sm)) +
  geom_hline(yintercept = 0, colour = "grey") +
  labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
  facet_wrap(~taxon, ncol = 3, scales = "free") + #strip.position
  theme(
    axis.text.y = element_blank(),
    strip.text.y.left = element_text(angle = 0)
  )

Let’s try to define groups based of these ECDFs:

  • Ctenophora + Rhizaria

  • All others